In [1]:
from itertools import count
from scipy import sparse
from scipy.linalg import norm
from scipy.sparse.linalg import svds
import numpy as np
import timeit
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline
import time
import json
In [2]:
n = 300000
In [3]:
raw = np.genfromtxt('train_triplets.txt', max_rows=n, dtype="S40,S18,i8")
In [4]:
'''
Construct a COO (coordinate-format) sparse matrix from the raw triplets
'''
row = []
col = []
data = []
user_to_row = {}
song_to_col = {}
user_count = count()
song_count = count()
for i, (uid, sid, play_count) in enumerate(raw):
    if uid not in user_to_row:
        user_to_row[uid] = next(user_count)
    row.append(user_to_row[uid])
    if sid not in song_to_col:
        song_to_col[sid] = next(song_count)
    col.append(song_to_col[sid])
    data.append(float(play_count))
arr = np.array(data)  # transform list to ndarray for convenience
In [5]:
'''
Preprocess the data by binning the play counts into b bins
and store the result in a sparse COO matrix
'''
b = 10
for i in range(b):
    arr[(arr >= (1 << i)) & (arr <= ((1 << (i + 1)) - 1))] = i + 1
arr[arr >= (1 << b)] = b
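# Resulting bin mapping (with b = 10): play count 1 -> bin 1, 2-3 -> bin 2,
# 4-7 -> bin 3, ..., 512-1023 -> bin 10; the line above additionally clamps
# counts >= 2**b into the top bin.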
M = sparse.coo_matrix((arr, (row, col)))
In [6]:
'''
Helper methods for removing users/songs
'''
def removeCol(M):
    M = M.tocsc()  # transform to CSC for fast column slicing
    cols = []
    for i in range(M.shape[1]):
        nnz = M.getcol(i).count_nonzero()
        if nnz > 5:
            cols.append(i)
    M = M[:, cols]
    return M
def removeRow(M):
    M = M.tocsr()  # transform to CSR for fast row slicing
    rows = []
    for i in range(M.shape[0]):
        nnz = M.getrow(i).count_nonzero()
        if nnz > 5:
            rows.append(i)
    M = M[rows, :]
    return M
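# A vectorized alternative (a sketch under the same >5 threshold;
# removeColFast/removeRowFast are new names, not used below). getnnz(axis=...)
# counts stored entries per column/row in one call, which matches
# count_nonzero() here because no explicit zeros have been stored yet.
def removeColFast(M):
    M = M.tocsc()
    return M[:, np.flatnonzero(M.getnnz(axis=0) > 5)]
def removeRowFast(M):
    M = M.tocsr()
    return M[np.flatnonzero(M.getnnz(axis=1) > 5), :]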
In [7]:
'''
Preprocess the data to avoid the cold-start issue.
That is, repeatedly remove all users/songs
that occur 5 or fewer times in M,
until M no longer changes.
'''
prevShape = M.shape
M = removeCol(M)
currShape = M.shape
while prevShape != currShape:
    prevShape = currShape
    M = removeCol(M)
    M = removeRow(M)
    currShape = M.shape
In [8]:
M = M.T  # transpose M so that songs map to rows and users to columns
In [9]:
'''
To properly evaluate our approach, set aside 200 randomly
selected non-zero data points from the matrix as a test set.
That is, store their values separately and set them to 0 in M.
'''
M = M.tocsr()
nz_row_indices, nz_col_indices = M.nonzero()
nz_indices = np.vstack((nz_row_indices, nz_col_indices))
np.random.shuffle(nz_indices.T)
test_set_indices = nz_indices[:,:200]
M_test = M[test_set_indices[0,:],test_set_indices[1,:]]
M[test_set_indices[0,:],test_set_indices[1,:]] = 0 # set test instances to zero in M
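# Note (an optional cleanup, not in the original flow): assigning 0 into a CSR
# matrix keeps those entries as explicitly stored zeros; eliminate_zeros()
# drops them so the stored structure matches the logical sparsity pattern.
M.eliminate_zeros()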
In [10]:
'''
Additionally, create a small validation set. That is, set aside 1000 randomly selected
non-zero data points and set them to 0 in M. You can use this validation set for
hyper-parameter tuning and to judge the convergence of your methods.
'''
nz_row_indices, nz_col_indices = M.nonzero()
nz_indices = np.vstack((nz_row_indices, nz_col_indices))
np.random.shuffle(nz_indices.T)
val_set_indices = nz_indices[:,:1000]
M_val = M[val_set_indices[0,:], val_set_indices[1,:]]
M[val_set_indices[0,:], val_set_indices[1,:]] = 0
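M.eliminate_zeros()  # same optional cleanup as above, for the newly zeroed validation entries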
In [11]:
# Functions
class Timer:
    def __init__(self):
        self.timers = {}
        self.next_id = 0
    def start(self, msg=""):
        print("START %d-timer on: %s" % (self.next_id, msg))
        self.timers[self.next_id] = [msg, time.time()]
        self.next_id += 1
    def end(self):
        if self.next_id - 1 in self.timers:
            end_time = time.time() - self.timers[self.next_id - 1][1]
            print("END %d-timer on: %s in %.2f sec" % (
                self.next_id - 1,
                self.timers[self.next_id - 1][0],
                end_time))
            self.next_id -= 1
            self.timers.pop(self.next_id)  # remove the finished timer so it cannot be ended twice
            return end_time
        else:
            print("NO TIMER started")
            return None
# The objective loss
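# Written out, the regularized squared-error objective computed below is
#   L(Q, P) = sum over nonzero (i, j) of
#             (M_ij - Q_i . P_j)^2 + lambda_1 * ||P_j||^2 + lambda_2 * ||Q_i||^2
# where Q_i is row i of Q (song factors) and P_j is column j of P (user factors).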
def loss(M, Q, P, nnz_elem, opt):
    err = 0
    reg_x = 0
    reg_i = 0
    for idx in range(nnz_elem.shape[1]):  # over all non-zero elements in M
        i = nnz_elem[0, idx]
        j = nnz_elem[1, idx]
        err += (M[i, j] - np.dot(Q[i, :], P[:, j]))**2
        reg_x += norm(P[:, j])**2
        reg_i += norm(Q[i, :])**2
    return err + opt['lambda_1']*reg_x + opt['lambda_2']*reg_i
def validation_loss(val_set_indices, M_val, Q, P, opt):
    err = 0
    reg_x = 0
    reg_i = 0
    for k in range(val_set_indices.shape[1]):
        i = val_set_indices[0, k]
        j = val_set_indices[1, k]
        err += (M_val[0, k] - np.dot(Q[i, :], P[:, j]))**2
        reg_x += norm(P[:, j])**2
        reg_i += norm(Q[i, :])**2
    return err + opt['lambda_1']*reg_x + opt['lambda_2']*reg_i
def test_error(test_set_indices, M_test, Q, P):
    # root-mean-squared error over the held-out test entries
    err = 0
    for k in range(test_set_indices.shape[1]):
        i = test_set_indices[0, k]
        j = test_set_indices[1, k]
        err += (M_test[0, k] - np.dot(Q[i, :], P[:, j]))**2
    return sqrt(err*1.0/test_set_indices.shape[1])
# Standard Gradient Descent
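# Update rule implemented by the inner loop below (eta = learning rate):
#   e_ij  = M_ij - Q_i . P_j
#   P_kj += eta * 2 * (e_ij * Q_ik - lambda_1 * P_kj)
#   Q_ik += eta * 2 * (e_ij * P_kj - lambda_2 * Q_ik)
# i.e. a step along the negative gradient of the objective above. The deltas
# are accumulated over all nonzeros and applied once per step (full-batch GD).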
def GD(M, Q, P, nnz_elem, val_set_indices, M_val, opt, timer=None):
    l = []
    last_val_loss = np.inf
    last_loss = np.inf
    status = "Maximum steps reached"
    iteration_measured = False
    for step in range(opt['steps']):
        if not iteration_measured and timer is not None:
            timer.start("ITERATION %s" % json.dumps(opt, indent=1))
        delta_P = np.zeros_like(P)
        delta_Q = np.zeros_like(Q)
        for idx in range(nnz_elem.shape[1]):
            i = nnz_elem[0, idx]
            j = nnz_elem[1, idx]
            eij = M[i, j] - np.dot(Q[i, :], P[:, j])
            for k in range(opt['factors']):
                delta_P[k, j] += opt['learning_rate'] * 2 * (eij * Q[i, k] - opt['lambda_1'] * P[k, j])
                delta_Q[i, k] += opt['learning_rate'] * 2 * (eij * P[k, j] - opt['lambda_2'] * Q[i, k])
        P += delta_P
        Q += delta_Q
        if not iteration_measured:
            if timer is not None:
                timer.end()
            iteration_measured = True
        if step % opt['epoch'] == 0:
            cur_loss = loss(M=M, Q=Q, P=P, nnz_elem=nnz_elem, opt=opt)
            l.append(cur_loss)
            val_loss = validation_loss(val_set_indices=val_set_indices, M_val=M_val, Q=Q, P=P, opt=opt)
            if val_loss > last_val_loss:
                status = "Validation loss increased"
                break
            elif abs(cur_loss - last_loss) < opt['stopping_eps']:
                status = "Training loss converged"
                break
            else:
                last_val_loss = val_loss
                last_loss = cur_loss
            print("After %d steps:\n\tTraining Loss: %.4lf\n\tValidation Loss: %.4lf" % (step + 1, l[-1], val_loss))
    plt.plot(np.arange(len(l)), l)
    return last_val_loss, status
'''
Get a batch of indices of size end - start
'''
def get_batch(nnz_elem, start, end):
    return nnz_elem[:, start:end]
'''
Count occurrences of each unique id in an index array
Arguments: indices: ndarray
Returns: dict mapping each unique id found in indices to its number of occurrences
'''
def get_count(indices):
    unique, counts = np.unique(indices, return_counts=True)
    to_dict = dict(zip(unique, counts))
    return to_dict
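# Note on the reweighting used in SGD below: each batch update for column j is
# scaled by (occurrences of j among all nonzeros) / (occurrences of j in the
# batch), and likewise for row i, so a mini-batch update approximates the
# magnitude of the corresponding full-gradient step. The count dicts are
# precomputed outside the inner loops, since they are constant per batch.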
def SGD(M, Q, P, nnz_elem, val_set_indices, M_val, opt, timer=None):
    l = []
    last_val_loss = np.inf
    last_loss = np.inf
    start = 0
    end = 0
    status = "Maximum steps reached"
    iteration_measured = False
    np.random.shuffle(nnz_elem.T)
    # precompute occurrence counts over all nonzeros once; they never change
    row_counts = get_count(nnz_elem[0])
    col_counts = get_count(nnz_elem[1])
    for step in range(opt['steps']):
        if not iteration_measured and timer is not None:
            timer.start("ITERATION %s" % json.dumps(opt, indent=1))
        start = end  # update bounds for current batch
        end += opt['batch_size']
        delta_P = np.zeros_like(P)
        delta_Q = np.zeros_like(Q)
        batch = get_batch(nnz_elem, start, end)  # contains indices for current batch
        # per-batch occurrence counts, computed once per batch
        batch_row_counts = get_count(batch[0])
        batch_col_counts = get_count(batch[1])
        for idx in range(batch.shape[1]):
            i = batch[0, idx]
            j = batch[1, idx]
            eij = M[i, j] - np.dot(Q[i, :], P[:, j])
            for k in range(opt['factors']):
                delta_P[k, j] += (col_counts[j] / batch_col_counts[j]) * opt['learning_rate'] * 2 * (eij * Q[i, k] - opt['lambda_1'] * P[k, j])
                delta_Q[i, k] += (row_counts[i] / batch_row_counts[i]) * opt['learning_rate'] * 2 * (eij * P[k, j] - opt['lambda_2'] * Q[i, k])
        P += delta_P
        Q += delta_Q
        if not iteration_measured:
            if timer is not None:
                timer.end()
            iteration_measured = True
        if step % opt['epoch'] == 0:
            cur_loss = loss(M=M, Q=Q, P=P, nnz_elem=nnz_elem, opt=opt)
            l.append(cur_loss)
            val_loss = validation_loss(val_set_indices=val_set_indices, M_val=M_val, Q=Q, P=P, opt=opt)
            if val_loss > last_val_loss:
                status = "Validation loss increased"
                break
            elif abs(cur_loss - last_loss) < opt['stopping_eps']:
                status = "Training loss converged"
                break
            else:
                last_val_loss = val_loss
                last_loss = cur_loss
            print("After %d steps:\n\tTraining Loss: %.4lf\n\tValidation Loss: %.4lf" % (step + 1, l[-1], val_loss))
    plt.plot(np.arange(len(l)), l)
    return last_val_loss, status
def set_optimizer_options(name, learning_rate, lambda_1, lambda_2, steps, epoch, factors, batch_size, stopping_eps):
    options = {
        'optimizer': name,
        'learning_rate': learning_rate,
        'lambda_1': lambda_1,
        'lambda_2': lambda_2,
        'steps': steps,
        'epoch': epoch,
        'factors': factors,
        'batch_size': batch_size,
        'stopping_eps': stopping_eps
    }
    return options
def init(M, opt):
    # warm-start the factors from a truncated SVD, so that Q.dot(P) approximates M
    U, S, Vt = svds(M, k=opt['factors'])
    Q = U * S
    P = Vt
    nz_i_indices, nz_x_indices = M.nonzero()
    nz_indices = np.vstack((nz_i_indices, nz_x_indices))
    return Q, P, nz_i_indices, nz_x_indices, nz_indices
'''
def train_hyperparameters(optimizer_func, M, Q, P, nnz_elem, opt, timer):
    timer.start("%s" % json.dumps(opt, indent=1))
    optimizer_func(M, Q, P, nnz_elem, opt)
    timer.end()
    pass
'''
In [12]:
# Create timer object
timer = Timer()
In [22]:
# Standard Gradient Descent
opt = set_optimizer_options(name='GradientDescent',
                            learning_rate=0.0001,
                            lambda_1=0.5,
                            lambda_2=0.5,
                            steps=5,
                            epoch=1,
                            factors=30,
                            batch_size=None,
                            stopping_eps=0.001)
Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M=M,opt=opt)
val_loss, status = GD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))
In [20]:
# Stochastic Gradient Descent
opt = set_optimizer_options(name='StochasticGradientDescent',
                            learning_rate=0.00009,
                            lambda_1=0.5,
                            lambda_2=0.5,
                            steps=500,
                            epoch=10,
                            factors=30,
                            batch_size=1,
                            stopping_eps=0.001)
# Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M, opt)  # left commented: Q and P from the previous cell are reused
val_loss, status = SGD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))
In [15]:
'''
Parameters for SGD with a batch size greater than 1.
'''
opt = set_optimizer_options(name='StochasticGradientDescent',
                            learning_rate=0.0003,
                            lambda_1=0.5,
                            lambda_2=0.5,
                            steps=30,
                            epoch=5,
                            factors=30,
                            batch_size=100,
                            stopping_eps=0.001)
# Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M, opt)  # left commented: Q and P are again reused, not reinitialized
val_loss, status = SGD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))
In [ ]: